# Keep things nice and tidy, all libraries go here
library(readxl)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(svglite)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Read the review spreadsheet (skipping its first row) and keep only the
# rows whose `Exclude` cell is empty.
data <- read_excel("data.xlsx", skip = 1) %>%
  filter(is.na(Exclude))
## New names:
## * `` -> ...35
## * `` -> ...71

Visualizing number of publications over time

# Bar chart of the number of publications per year, with the count printed
# in white inside each bar.
ggplot(data, aes(x = as.factor(Year))) +
  geom_bar() +
  # after_stat(count) is the supported spelling of the old `..count..`
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 2, colour = "white", size = 2.5
  ) +
  labs(x = "Year", y = "Number of publications") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# No plot is passed, so ggsave() writes the last plot displayed.
ggsave("yearly_distribution.eps")
## Saving 7 x 5 in image
# Treat "?" in the Academia / Industry columns as missing, then label each
# publication by which of the two flags is set:
#   neither set -> "None", only Academia "1.0" -> "Academia",
#   only Industry "1.0" -> "Industry", anything else -> "Both".
data <- data %>%
  mutate_at(vars(Academia, Industry), ~ replace(.x, .x == "?", NA)) %>%
  mutate(
    Type = case_when(
      is.na(Academia) & is.na(Industry) ~ "None",
      is.na(Industry) & Academia == "1.0" ~ "Academia",
      is.na(Academia) & Industry == "1.0" ~ "Industry",
      TRUE ~ "Both"
    )
  )

Number of publications according to their type

# Bar chart of publications per Type, most frequent type first, with the
# count printed in white inside each bar.
data %>%
  mutate(Type = fct_infreq(Type, ordered = TRUE)) %>%
  ggplot(aes(x = Type)) +
  geom_bar(width = .5) +
  # after_stat(count) is the supported spelling of the old `..count..`
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = 3, colour = "white", size = 4
  ) +
  labs(x = "Type of publication", y = "Number of publications") +
  theme_bw()

# No plot is passed, so ggsave() writes the last plot displayed.
ggsave("academia_industry_distribution.eps")
## Saving 7 x 5 in image

Number of publications categorized according to SWEBoK Areas.

A publication can be in more than one category at the same time.

# Bar chart of the per-area totals of the SWEBoK flag columns, tallest bar
# first, with each total printed above its bar.
data %>%
  select(7:21) %>% # columns 7-21 are the SWEBoK area columns
  mutate_all(replace_na, 0) %>% # an empty cell counts as 0
  summarise_all(sum) %>%
  gather(key = "SWEBOKArea", value = "publications", everything()) %>%
  arrange(desc(publications)) %>%
  # freeze the sorted order as the factor level order for the x axis
  mutate(SWEBOKArea = fct_inorder(SWEBOKArea)) %>%
  ggplot(aes(x = SWEBOKArea, y = publications)) +
  geom_col() +
  geom_text(
    aes(label = publications),
    vjust = -0.3, colour = "black", size = 4
  ) +
  labs(x = "SWEBoK Area", y = "Number of publications") +
  theme_bw()

ggsave("swebok_distribution.eps")
## Saving 7 x 5 in image

Co-occurrences of SWEBoK Areas

# Cross-product of the SWEBoK area columns (missing cells counted as 0).
# With 0/1 flags, cell [i, j] is the number of publications flagged in both
# area i and area j, and the diagonal holds the per-area totals.
swebokareas <- crossprod(
  data %>%
    select(7:21) %>% # columns 7-21 are the SWEBoK area columns
    mutate_all(replace_na, 0) %>%
    as.matrix()
)

# Render the matrix as a table.
kable(swebokareas)
SR SD SC ST SM SCM SEM SEP SEMM SQ SEPP SEE CF MF EF
SR 49 18 5 2 4 0 7 2 4 0 7 0 0 0 1
SD 18 66 17 3 4 0 6 2 6 1 6 0 0 0 1
SC 5 17 77 5 22 1 3 2 2 0 3 0 0 0 0
ST 2 3 5 12 4 0 1 0 0 0 0 0 0 0 0
SM 4 4 22 4 46 1 2 1 0 0 1 0 0 0 0
SCM 0 0 1 0 1 2 0 1 0 0 0 0 0 0 0
SEM 7 6 3 1 2 0 26 3 1 0 7 3 0 0 1
SEP 2 2 2 0 1 1 3 10 0 0 2 1 0 0 0
SEMM 4 6 2 0 0 0 1 0 8 0 1 0 0 0 0
SQ 0 1 0 0 0 0 0 0 0 6 0 0 0 0 0
SEPP 7 6 3 0 1 0 7 2 1 0 18 3 0 0 1
SEE 0 0 0 0 0 0 3 1 0 0 3 5 0 0 0
CF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
MF 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
EF 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1
# Interactive heat map of the SWEBoK co-occurrence matrix.
# Axis labels come from the matrix's own dimnames so they always match its
# 15 rows/columns; the previous hand-typed label vectors omitted "SCM".
plot_ly(
  x = colnames(swebokareas),
  y = rownames(swebokareas),
  z = swebokareas,
  type = "heatmap"
)
# Long table with one row per (SWEBOK area, taxonomy column) pair:
#   count - sum of that taxonomy column over the publications in the area
#   label - count as text, with a leading "0" stripped (so 0 prints as "")
x <- data %>%
  select(7:21, matches('Attention|Memory|Cognitive load|CL$|Problem solving|Reasoning|Decision making|Errors| biases$' )) %>%
  # Coerce `Problem solving` BEFORE zero-filling: entries that cannot be
  # parsed become NA here and are then replaced by 0 below. Doing it the
  # other way round left an NA that turned a whole group sum into NA.
  mutate(`Problem solving` = as.numeric(`Problem solving`)) %>%
  mutate_all(replace_na, 0) %>%
  gather(key = "SWEBOK", value = pubs, 1:15) %>% # SWEBOK area as a key
  filter(pubs > 0) %>% # keep only the areas a publication belongs to
  group_by(SWEBOK) %>%
  summarise_all(sum) %>% # per-area totals of every taxonomy column
  select(-pubs) %>% # drop pubs; the name is reused later
  # gather every taxonomy column, however many the pattern matched
  gather(key = "Taxonomy", value = "count", -SWEBOK) %>%
  mutate(label = str_replace(as.character(count), "^0", ""))
## Warning: NAs introduced by coercion
# Bubble plot: one bubble per (SWEBOK area, taxonomy topic) pair, sized by
# `count` and annotated with `label`.

# Sort by Taxonomy and freeze that order as the factor level order.
x <- arrange(x, Taxonomy)
xf <- x$Taxonomy
xfu <- unique(xf)
x$Taxonomy <- factor(xf, levels = xfu)

p <- ggplot(x)
p +
  geom_point(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), size = count),
    shape = 21, fill = "white", alpha = 0.60
  ) +
  geom_text(
    aes(x = fct_infreq(SWEBOK), y = fct_rev(Taxonomy), label = label),
    size = 2
  ) +
  labs(x = "SWEBOK Area", y = "Taxonomy Area") +
  # theme_bw() is a complete theme: added last it discarded every setting
  # in theme() below, so it has to come first.
  theme_bw() +
  theme(
    axis.text.x = element_text(
      angle = 45, hjust = 1.1, size = 9, colour = "black"
    ),
    axis.text.y = element_text(size = 8, colour = "black"),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10, colour = "black", vjust = 0.12),
    panel.grid.major = element_line(
      linetype = "dashed", size = 0.1, color = "black"
    )
  )
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).

ggsave("swebok_taxonomy_bubble.pdf")
## Saving 7 x 5 in image
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_text).
# Derive three 0/1 indicator columns for the research-method analysis:
#   Quantitative - 1 when any quantitative measure column equals 1
#   Qualitative  - 1 when any qualitative measure column equals 1
#   Both         - 1 when both indicators above are 1
# A test that evaluates to NA (missing cells) counts as "not set".
data <- data %>%
  mutate(
    Quantitative = if_else(
      coalesce(
        `Quantit. measures` == 1 | `Task performance` == 1 |
          `Physiological meas.` == 1 | `Subjective ratings` == 1 |
          `Behavioral meas.` == 1,
        FALSE
      ),
      1, 0
    ),
    Qualitative = if_else(
      coalesce(
        Fieldwork == 1 | Interview == 1 | `Qualit. measures` == 1 |
          `Task-based` == 1 | `Open observation` == 1,
        FALSE
      ),
      1, 0
    ),
    Both = if_else(Qualitative == 1 & Quantitative == 1, 1, 0)
  )

The graphs below were prepared for the IEEE Software submission.

Number of publications per year according to SWEBOK areas

# Stacked bar chart: publications per year, split by SWEBOK area.
data %>%
  filter(is.na(Exclude)) %>% # same filter as applied when loading the data
  select(Year, SR:EF) %>%
  gather("SWEBOK", "publications", -Year) %>%
  # Zero-fill only the numeric value column; replace_na() with 0 on the
  # character key column changes its type, which tidyr >= 1.2 rejects.
  mutate(publications = replace_na(publications, 0)) %>%
  group_by(Year, SWEBOK) %>%
  summarise(total = sum(publications)) %>%
  ungroup() %>%
  ggplot(aes(x = as.factor(Year), y = total, fill = SWEBOK)) +
  # The totals are already computed, so draw them directly; stat = "sum"
  # added a size aesthetic that then had to be hidden from the legend.
  geom_col() +
  labs(x = "Year", y = "Publications") +
  scale_fill_discrete(name = "SWEBOK Areas") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))

ggsave("years_swebok.pdf")
## Saving 7 x 5 in image

Evolution of research methods over the years

# Add a row for every year from 1973 to 2016 that has no publication, then
# label each publication's research method. Rows added by complete() have
# no Both/Qualitative value and keep an NA method.
# NOTE(review): a publication with Both != 1 and Qualitative != 1 is
# labelled "Quantitative" even if its Quantitative flag is 0 - confirm
# that this fall-through is intended.
data <- data %>%
  complete(Year = seq(1973, 2016)) %>%
  mutate(
    research_method = case_when(
      is.na(Both) ~ NA_character_,
      Both == 1 ~ "Mixed",
      is.na(Qualitative) ~ NA_character_,
      Qualitative == 1 ~ "Qualitative",
      TRUE ~ "Quantitative"
    )
  )

# Stacked bar chart of publications per year by research method. The NA
# method of the filler rows is drawn transparent with an empty legend label.
ggplot(data, aes(x = as.factor(Year), fill = research_method)) +
  geom_bar() +
  scale_fill_discrete(
    name = "Research method",
    labels = c("Mixed", "Qualitative", "Quantitative", ""),
    na.value = "transparent"
  ) +
  labs(x = "Year", y = "Publications") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 5))

ggsave("years_researchmethods.pdf")
## Saving 7 x 5 in image

Prevalence of research methods in the SWEBOK areas

# Per research method, the total of each SWEBOK area column, in long form
# (columns: research_method, SWEBOK, Publications).
data.swebok.researchmethod <- data %>%
  # Drop rows without a research method explicitly. The previous version
  # zero-filled the character column (creating a "0" group) and removed it
  # by position with slice(2:4), which breaks if the groups ever change
  # and fails on tidyr >= 1.2, where replace_na() may not change types.
  filter(!is.na(research_method)) %>%
  select(SR:EF, research_method) %>%
  mutate_at(vars(SR:EF), replace_na, 0) %>%
  group_by(research_method) %>%
  summarise_at(vars(SR:EF), sum) %>%
  gather("SWEBOK", "Publications", -research_method)

# Horizontal stacked bars, areas ordered by their total across methods.
data.swebok.researchmethod %>%
  ggplot(aes(
    x = reorder(SWEBOK, Publications, sum),
    y = Publications,
    fill = research_method
  )) +
  geom_col() +
  coord_flip() +
  xlab("SWEBOK areas") +
  scale_fill_discrete(name = "Research method")

ggsave("SWBOK_researchmethods.pdf")
## Saving 7 x 5 in image

Distribution of publications across taxonomy topics and research methods

# Bubble chart of (taxonomy topic, research method) pairs: bubble size is
# the number of publications that have a non-missing entry for both.
# NOTE(review): any non-NA cell counts as "present", including a 0 if the
# sheet uses explicit zeros - confirm against the spreadsheet.
data %>%
  filter(!is.na(Identifier)) %>% # skip rows that are not publications
  select(
    Identifier,
    Attention:`Extrinsic CL`,
    Perception:`Social Cognition`,
    Fieldwork:`Behavioral meas.`
  ) %>%
  gather(Taxonomy, value, Attention:`Social Cognition`) %>%
  filter(!is.na(value)) %>%
  select(-value) %>%
  gather(Method, value, Fieldwork:`Behavioral meas.`) %>%
  filter(!is.na(value)) %>%
  count(Taxonomy, Method, name = "Amount") %>%
  ggplot(aes(x = Method, y = Taxonomy)) +
  geom_point(aes(size = Amount), alpha = 0.5) +
  # "none" is the documented way to hide the legend; "" is not a valid
  # legend.position value.
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 30, hjust = 1, size = 8)
  )

ggsave("taxonomy_methods.pdf")
## Saving 7 x 5 in image
# Heat map of how many publications are flagged with both a given SWEBOK
# area and a given taxonomy topic.
data %>%
  select(SR:EF, Attention:`Extrinsic CL`, Perception:`Social Cognition`) %>%
  # Coerce every flag to numeric before comparing with 1: gathering mixed
  # text/numeric columns yields text, where "1.0" == 1 is FALSE.
  # Missing or unparseable cells count as 0.
  mutate_all(~ replace_na(as.numeric(.x), 0)) %>%
  gather(SWEBOK, in_area, SR:EF) %>%
  gather(Taxonomy, in_topic, Attention:`Social Cognition`) %>%
  # Exactly one row per tile. The previous count()-based version produced
  # several rows per tile and relied on the right one being drawn last.
  group_by(SWEBOK, Taxonomy) %>%
  summarise(freq = sum(in_area == 1 & in_topic == 1)) %>%
  ungroup() %>%
  ggplot(aes(fct_rev(SWEBOK), fct_rev(Taxonomy), fill = freq)) +
  geom_tile() +
  scale_fill_continuous(low = "#fff9f7", high = "red") +
  labs(x = "SWEBOK", y = "Topic") +
  guides(fill = guide_legend(title = "")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

ggsave("taxomony_swebok_cooccurences.pdf")
## Saving 7 x 5 in image
# Heat map of how many publications are flagged with both a given taxonomy
# topic and a given research measure.
data %>%
  select(
    Attention:`Extrinsic CL`,
    Perception:`Social Cognition`,
    Fieldwork:`Behavioral meas.`
  ) %>%
  # Coerce every flag to numeric before comparing with 1: gathering mixed
  # text/numeric columns yields text, where "1.0" == 1 is FALSE.
  # Missing or unparseable cells count as 0.
  mutate_all(~ replace_na(as.numeric(.x), 0)) %>%
  gather(Taxonomy, in_topic, Attention:`Social Cognition`) %>%
  gather(Method, uses_method, Fieldwork:`Behavioral meas.`) %>%
  # Exactly one row per tile. The previous count()-based version produced
  # several rows per tile and relied on the right one being drawn last.
  group_by(Taxonomy, Method) %>%
  summarise(freq = sum(in_topic == 1 & uses_method == 1)) %>%
  ungroup() %>%
  ggplot(aes(Method, fct_rev(Taxonomy), fill = freq)) +
  geom_tile() +
  labs(x = "Measure", y = "Topic") +
  guides(fill = guide_legend(title = "")) +
  scale_fill_continuous(low = "#fff9f7", high = "darkgreen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8))

ggsave("taxonomy_method_cooccurences.pdf")
## Saving 7 x 5 in image
# Stacked bar chart: publications per year, split by taxonomy topic.
data %>%
  select(Year, Attention:`Extrinsic CL`, Perception:`Social Cognition`) %>%
  gather("Taxonomy", "publications", Attention:`Social Cognition`) %>%
  # Coerce first, then zero-fill: a cell that cannot be parsed becomes 0
  # instead of an NA that makes the whole Year/Taxonomy total NA and drops
  # its bar from the chart.
  mutate(publications = replace_na(as.integer(publications), 0L)) %>%
  group_by(Year, Taxonomy) %>%
  summarise(total = sum(publications)) %>%
  ungroup() %>%
  ggplot(aes(as.factor(Year), total, fill = Taxonomy)) +
  # The totals are already computed, so draw them directly; stat = "sum"
  # added a size aesthetic that then had to be hidden from the legend.
  geom_col() +
  labs(x = "Year", y = "Publications") +
  scale_fill_discrete(name = "Taxonomy Areas") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6))
## Warning: NAs introduced by coercion
## Warning: Removed 1 rows containing non-finite values (stat_sum).

# One row per (publication, taxonomy topic) pair whose flag is positive.
df.taxonomy <- data %>%
  select(Year, Attention:`Extrinsic CL`, Perception:`Social Cognition`) %>%
  gather("Taxonomy", "publications", Attention:`Social Cognition`) %>%
  # the gathered values arrive as character, so convert before comparing
  mutate(publications = as.integer(publications)) %>%
  # filter() also drops NA, so missing or unparseable cells need no
  # zero-fill (a type-changing replace_na() fails on tidyr >= 1.2)
  filter(publications > 0)
## Warning: NAs introduced by coercion
# Separate table holding each topic's share of the rows within its year,
# formatted as a percentage label.
data.percentage <- df.taxonomy %>%
  group_by(Year) %>%
  count(Taxonomy) %>%
  mutate(ratio = scales::percent(n / sum(n))) %>%
  ungroup()

# 100%-stacked bars per year, each segment labelled with its share.
df.taxonomy %>%
  ggplot(aes(x = as.factor(Year), fill = as.factor(Taxonomy))) +
  geom_bar(position = "fill") +
  geom_text(
    data = data.percentage,
    aes(y = n, label = ratio),
    position = position_fill(vjust = 0.5),
    colour = "white", size = 1.3
  ) +
  labs(x = "Year", y = "Publications %") +
  scale_fill_discrete(name = "Topic") +
  scale_y_continuous(labels = percent) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8),
    legend.key.size = unit(.2, "cm"),
    legend.key.width = unit(0.2, "cm"),
    legend.title = element_text(size = 8),
    legend.text = element_text(size = 6)
  )

# ggsave() takes plain numbers measured in `units`, not grid unit objects.
ggsave("taxonomy_years.pdf", width = 9, height = 6.5, units = "in")